This dataset contains house sale prices for King County, which includes Seattle. It includes homes sold between May 2014 and May 2015.
Task: Try to estimate the price based on given features.

The term Boosting refers to a family of algorithms which convert weak learners into strong learners.
There are many boosting algorithms:
sklearn.ensemble.GradientBoostingRegressor
xgboost.XGBRegressor # fast and usually the strongest baseline
lightgbm.LGBMRegressor # extremely fast, slightly lower accuracy than xgb
catboost.CatBoostRegressor # well suited to categorical features
import sys
ENV_BHISHAN = None
try:
import bhishan
print('Environment: Personal environment')
ENV_BHISHAN = True
%load_ext autoreload
%autoreload 2
except:
print('Module "bhishan" not found.')
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
#!pip install hpsklearn
!pip install shap eli5
!pip install catboost
!pip install ipywidgets
!jupyter nbextension enable --py widgetsnbextension
# set OMP_NUM_THREADS=1 for hpsklearn package
#!export OMP_NUM_THREADS=1
print('Environment: Google Colab')
# Core scientific stack, plotting defaults, and notebook display settings.
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import os
import time
# random state
SEED=100
np.random.seed(SEED) # we need this in each cell
# Jupyter notebook settings for pandas
pd.set_option('display.max_columns', 200)
# pd.set_option('display.float_format', '{:,.4f}'.format) # numbers sep by comma
pd.set_option('display.max_rows', 100) # None for all the rows
pd.set_option('display.max_colwidth', 200)
print([(x.__name__,x.__version__) for x in [np, pd,sns,matplotlib]])
# Notebook magic: stop Jupyter from collapsing long cell outputs into a scroll box.
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;
import scipy
import sklearn
print([(x.__name__,x.__version__) for x in [scipy, sklearn]])
# scale and split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# pipeline
from sklearn.pipeline import Pipeline
# boosting
import xgboost as xgb
import lightgbm as lgb
import catboost
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBClassifier, DMatrix
from lightgbm import LGBMClassifier, Dataset
from catboost import CatBoostClassifier, Pool, CatBoost
print([(x.__name__,x.__version__) for x in [xgb, lgb,catboost]])
# six and pickle
import six
import pickle
import joblib
# metrics
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import explained_variance_score
# cross validation
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score, cross_val_predict
# Running results table: one row is appended per experiment further below.
df_eval = pd.DataFrame({'Model': [],
'Details':[],
'Train Neg MSE K-Fold Cross Validation':[],
'Test RMSE':[],
'Test Explained Variance Score':[],
'Test R-squared':[],
'Test Adjusted R-squared':[],
})
# model evaluation using shap
import shap
# shap_values = shap.TreeExplainer(model_xgb).shap_values(Xtest)
# shap.summary_plot(shap_values, Xtest)
# shap.dependence_plot("column_name", shap_values, Xtest)
def show_method_attributes(obj, ncols=7, start=None, inside=None):
    """Show the public attributes of an object in a DataFrame grid.

    Parameters
    ----------
    obj : object
        Any object (module, class, instance) whose attributes to list.
    ncols : int
        Number of columns of the returned grid.
    start : str, tuple or list, optional
        Keep only attributes starting with this prefix (or any of these).
    inside : str, tuple or list, optional
        Keep only attributes containing this substring (or any of these).

    Returns
    -------
    pd.DataFrame
        Attribute names laid out in `ncols` columns, padded with ''.

    Example:
    ========
    show_method_attributes(list)
    """
    print(f'Object Type: {type(obj)}\n')
    # drop dunder/private names and a few common module aliases
    excluded = set('os np pd sys time psycopg2'.split())
    lst = [elem for elem in dir(obj)
           if not elem.startswith('_') and elem not in excluded]
    if isinstance(start, str):
        lst = [elem for elem in lst if elem.startswith(start)]
    elif isinstance(start, (tuple, list)):
        # BUG FIX: the old double-loop duplicated an attribute when it matched
        # more than one prefix; str.startswith accepts a tuple directly.
        lst = [elem for elem in lst if elem.startswith(tuple(start))]
    if isinstance(inside, str):
        lst = [elem for elem in lst if inside in elem]
    elif isinstance(inside, (tuple, list)):
        # same de-duplication fix for the substring filter
        lst = [elem for elem in lst if any(sub in elem for sub in inside)]
    return pd.DataFrame(np.array_split(lst, ncols)).T.fillna('')
def adjustedR2(rsquared, nrows, kcols):
    """Return the adjusted r-squared for a fit evaluated on test data.

    Adjusted r-squared depends on the number of rows and columns of the
    test data; it shrinks the plain r-squared as the feature count grows.
    """
    penalty = (1.0 - rsquared) * (kcols - 1) / (nrows - kcols)
    return rsquared - penalty
# Load the cleaned & encoded dataset (GitHub raw URL so the notebook also
# runs on Colab without a local repo checkout).
# df_clean = pd.read_csv('../data/processed/data_cleaned_encoded.csv')
ifile = 'https://github.com/bhishanpdl/Project_House_Price_Prediction/blob/master/data/processed/data_cleaned_encoded.csv?raw=true'
df_clean = pd.read_csv(ifile)
print(df_clean.shape)
df_clean.head()

# I will just take column names from this and will use cleaned data further.
# df_raw = pd.read_csv('../data/raw/kc_house_data.csv')
df_raw = pd.read_csv('https://github.com/bhishanpdl/Project_House_Price_Prediction/blob/master/data/raw/kc_house_data.csv?raw=true',nrows=1)
df_raw.columns

features_raw_all = ['bedrooms', 'bathrooms', 'sqft_living',
                    'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
                    'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
                    'lat', 'long', 'sqft_living15', 'sqft_lot15']

# BUG FIX: .copy() so the log1p assignments below write to an independent
# frame instead of a view of df_clean (avoids SettingWithCopyWarning and
# possible silent no-ops on a chained-assignment view).
df = df_clean[features_raw_all + ['price']].copy()
dict_features = dict(enumerate(features_raw_all))
print(dict_features)
df.head(2)

# log-transform the heavily right-skewed columns (price is the target)
log_cols = ['price','sqft_living','sqft_living15','sqft_lot','sqft_lot15']
for col in log_cols:
    df[col] = np.log1p(df[col].to_numpy())

X = df[features_raw_all].to_numpy()
y = df['price'].to_numpy()

# 80/20 train/test split, then a further 80/20 train/valid split below
Xtrain_orig, Xtest, ytrain_orig, ytest = train_test_split(X, y,
                                                          test_size=0.20,
                                                          random_state=SEED)
df_Xtrain_orig = pd.DataFrame(Xtrain_orig, columns=features_raw_all)
df_Xtest = pd.DataFrame(Xtest, columns=features_raw_all)
Xtrain_orig.shape, ytrain_orig.shape, Xtest.shape, ytest.shape, Xtrain_orig[0][:2], Xtest[0][:2]

Xtrain, Xvalid, ytrain, yvalid = train_test_split(Xtrain_orig, ytrain_orig,
                                                  random_state=SEED, test_size=0.2)
df_Xtrain = pd.DataFrame(Xtrain, columns=features_raw_all)
df_Xvalid = pd.DataFrame(Xvalid, columns=features_raw_all)
df_Xtrain.head()

# integer-valued columns (cast so catboost can treat them as categorical later)
cols_int = ['bedrooms','waterfront','view','condition','grade','zipcode']
for c in cols_int:
    df_Xtrain[c] = df_Xtrain[c].astype(int)
    df_Xtest[c] = df_Xtest[c].astype(int)
    df_Xvalid[c] = df_Xvalid[c].astype(int)
df_Xtrain.head()

scaler = StandardScaler() # standard scaler better for regression
scaler.fit(Xtrain_orig)   # fit on train only; transform() is applied per cell
https://catboost.ai/docs/concepts/python-reference_catboostregressor.html
class CatBoostRegressor(iterations=None,learning_rate=None,depth=None,
l2_leaf_reg=None,model_size_reg=None,rsm=None,loss_function='RMSE',
border_count=None,feature_border_type=None,
per_float_feature_quantization=None,input_borders=None,
output_borders=None,fold_permutation_block=None,od_pval=None,
od_wait=None,od_type=None,nan_mode=None,counter_calc_method=None,
leaf_estimation_iterations=None,leaf_estimation_method=None,
thread_count=None,random_seed=None,use_best_model=None,
best_model_min_trees=None,verbose=None,silent=None,logging_level=None,
metric_period=None,ctr_leaf_count_limit=None,store_all_simple_ctr=None,
max_ctr_complexity=None,
has_time=None,allow_const_label=None,one_hot_max_size=None,
random_strength=None,name=None,ignored_features=None,
train_dir=None,custom_metric=None,eval_metric=None,
bagging_temperature=None,save_snapshot=None,
snapshot_file=None,snapshot_interval=None,
fold_len_multiplier=None,used_ram_limit=None,gpu_ram_part=None,
pinned_memory_size=None,allow_writing_files=None,
final_ctr_computation_mode=None,approx_on_full_history=None,
boosting_type=None,simple_ctr=None,combinations_ctr=None,
per_feature_ctr=None,ctr_target_border_count=None,task_type=None,
device_config=None,devices=None,bootstrap_type=None,subsample=None,
sampling_unit=None,dev_score_calc_obj_block_size=None,
max_depth=None,n_estimators=None,num_boost_round=None,
num_trees=None,colsample_bylevel=None,random_state=None,
reg_lambda=None,objective=None,eta=None,max_bin=None,
gpu_cat_features_storage=None,data_partition=None,
metadata=None,early_stopping_rounds=None,cat_features=None,
grow_policy=None,min_data_in_leaf=None,min_child_samples=None,
max_leaves=None,num_leaves=None,score_function=None,
leaf_estimation_backtracking=None,ctr_history_unit=None,
monotone_constraints=None)
# Baseline CatBoost regressor on the scaled numpy arrays; results are
# appended to the running df_eval table.
import catboost
show_method_attributes(catboost)
from catboost import CatBoostRegressor, Pool
show_method_attributes(CatBoostRegressor)
# help(CatBoostRegressor)
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from catboost import CatBoostRegressor
# time
time_start = time.time()
# current parameters
desc = 'default,log+standard scaling,random_state=0, numpy'
Xtr = scaler.transform(Xtrain)
Xtx = scaler.transform(Xtest)
ytr,ytx = ytrain, ytest
"""
default,log+standard scaling,random_state=0
numpy : Xtrain_scaled, Xtest_scaled ==> 0.900584
pandas: df_Xtrain_scaled, df_Xtest_scaled ==> -0.054372
"""
# fit the model
model_cat = CatBoostRegressor(verbose=1000,random_state=0)
model_cat.fit(Xtr, ytr)
# fitted model
model = model_cat
# save the model
# joblib.dump(model_cat, 'model_cat.pkl')
# model_cat = joblib.load('model_cat.pkl')
# ypreds
# NOTE(review): cross_val_predict refits fresh clones of the model on folds
# of the *test* data, so the fit above does not produce these predictions;
# a plain model.predict(Xtx) would be the standard out-of-sample evaluation.
kf=KFold(n_splits=5,shuffle=True,random_state=SEED)
ypreds = cross_val_predict(model, Xtx, ytx, cv=kf)
# train validation
cvs = cross_val_score(model, Xtr, ytr,cv=kf,
scoring = "neg_mean_squared_error")
score = cvs.mean()
# rmse
rmse = np.sqrt(sklearn.metrics.mean_squared_error(ytx,ypreds))
# explained variance
evs = explained_variance_score(ytx, ypreds)
# r-squared values
r2 = sklearn.metrics.r2_score(ytx, ypreds)
ar2 = adjustedR2(r2, Xtx.shape[0], Xtx.shape[1])
row_eval = ['catboost',
desc,
score,rmse,evs,r2,ar2]
df_eval.loc[len(df_eval)] = row_eval
df_eval = df_eval.drop_duplicates()
# time
time_taken = time.time() - time_start
print('Time taken: {:.0f} min {:.0f} secs'.format(*divmod(time_taken,60)))
# results
display(df_eval)
# show_method_attributes(model)
# model.get_all_params()
df_Xtrain.head(2)
# CatBoost on the *unscaled* numpy arrays, with a validation eval_set.
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from catboost import CatBoostRegressor

# time
time_start = time.time()

# current parameters
Xtr = Xtrain
Xtx = Xtest
Xvd = Xvalid
ytr,ytx,yvd = ytrain, ytest,yvalid

# fit the model (eval_set lets catboost report validation metrics)
model = CatBoostRegressor(random_state=0,verbose=1000)
model.fit(Xtr, ytr,
          eval_set=(Xvd, yvd))

# ypreds
# NOTE(review): cross_val_predict refits clones on folds of the test data;
# the fit above is not what produces these predictions.
kf=KFold(n_splits=5,shuffle=True,random_state=SEED)
ypreds = cross_val_predict(model, Xtx, ytx, cv=kf)

# r-squared values
r2 = sklearn.metrics.r2_score(ytx, ypreds)
ar2 = adjustedR2(r2, Xtx.shape[0], Xtx.shape[1])

# time
time_taken = time.time() - time_start
print('Time taken: {:.0f} min {:.0f} secs'.format(*divmod(time_taken,60)))
print('Adjusted R-squared value ', ar2)  # fixed typo: was "Adusted"
df_Xtrain.head(5)
# Same model, but now with explicit categorical features and use_best_model.
df_Xtrain2 = df_Xtrain.copy()
df_Xtest2 = df_Xtest.copy()
df_Xvalid2 = df_Xvalid.copy()

# earlier candidate categorical lists, kept for reference
# cols_cat = ['bedrooms','bathrooms','floors','waterfront',
# 'view','condition','grade','zipcode']
# cols_cat = ['waterfront','view','condition','grade','zipcode']
cols_cat = ['waterfront','view','condition','grade']
for c in cols_cat:
    df_Xtrain2[c] = df_Xtrain2[c].astype(int)
    df_Xtest2[c] = df_Xtest2[c].astype(int)
    df_Xvalid2[c] = df_Xvalid2[c].astype(int)
print(df_Xtrain2.shape, df_Xtest2.shape)
df_Xtrain2.head(2).append(df_Xtest2.head(2))

# positional indices of the categorical columns, as catboost expects
cat_features_idx = [df_Xtrain2.columns.get_loc(c) for c in cols_cat]
cat_features_idx

# time
time_start = time.time()

# current parameters
Xtr = df_Xtrain2
Xtx = df_Xtest2
ytr,ytx = ytrain, ytest
Xvd = df_Xvalid2
yvd = yvalid

# fit the model; use_best_model keeps the iteration with the best eval score
model = CatBoostRegressor(random_state=0,verbose=1000)
model.fit(Xtr, ytr,
          eval_set=(Xvd,yvd),
          cat_features=cat_features_idx,
          use_best_model=True
          )

# ypreds
kf=KFold(n_splits=5,shuffle=True,random_state=SEED)
ypreds = cross_val_predict(model, Xtx, ytx, cv=kf)

# r-squared values
r2 = sklearn.metrics.r2_score(ytx, ypreds)
ar2 = adjustedR2(r2, Xtx.shape[0], Xtx.shape[1])

# time
time_taken = time.time() - time_start
print('Time taken: {:.0f} min {:.0f} secs'.format(*divmod(time_taken,60)))
print('Adjusted R-squared value ', ar2)  # fixed typo: was "Adusted"
df_Xtrain.head()
# Fit a model treating some integer columns as categorical, then inspect
# the per-feature statistics catboost collected during training.
model = catboost.CatBoostRegressor(cat_features=['bedrooms', 'view','grade','condition'],
one_hot_max_size=300, iterations=500)
model.fit(df_Xtrain, ytrain, silent=True)
# float feature
feature_name = 'sqft_living'
dict_stats = model.calc_feature_statistics(df_Xtrain, ytrain, feature_name, plot=True)
# one hot feature
feature_name = 'bedrooms'
cat_vals = df_Xtrain[feature_name].unique().tolist()
dict_stats = model.calc_feature_statistics(df_Xtrain, ytrain, feature_name)
dict_stats.keys()
# show the size of each statistics array returned
for key in dict_stats.keys():
print(key, len(dict_stats[key]))
df_Xtrain['bedrooms'].unique()
df_Xtrain['bedrooms'].nunique()
# show_method_attributes(model)
# feature importance
df_imp = pd.DataFrame({'Feature': features_raw_all,
'Importance': model.feature_importances_
})
df_imp.sort_values('Importance',ascending=False).style.background_gradient()
def plot_feature_imp_catboost(model_catboost, features=None):
    """Plot the feature importance horizontal bar plot.

    Parameters
    ----------
    model_catboost : fitted catboost model
        Must expose ``feature_importances_`` (and ideally ``feature_names_``).
    features : list of str, optional
        Feature labels used when the model has no usable ``feature_names_``
        (e.g. when it was fit on a bare numpy array). Now optional for
        backward compatibility with existing two-argument calls.
    """
    # BUG FIX: the old body read the module-level global ``model`` and
    # ignored both arguments; it now uses the model that was passed in.
    names = getattr(model_catboost, 'feature_names_', None) or features
    df_imp = pd.DataFrame({'Feature': names,
                           'Importance': model_catboost.feature_importances_
                           })
    df_imp = df_imp.sort_values('Importance').set_index('Feature')
    ax = df_imp.plot.barh(figsize=(12,8))
    plt.grid(True)
    plt.title('Feature Importance',fontsize=14)
    ax.get_legend().remove()
    # annotate each bar with its importance value
    for p in ax.patches:
        x = p.get_width()
        y = p.get_y()
        text = '{:.2f}'.format(p.get_width())
        ax.text(x, y,text,fontsize=15,color='indigo')
    plt.show()
plot_feature_imp_catboost(model, features_raw_all)
# Same importances again via catboost's own prettified frame + seaborn.
df_fimp = model.get_feature_importance(prettified=True)
plt.figure(figsize=(12,8))
ax = sns.barplot(x="Importances", y="Feature Id", data=df_fimp);
# annotate each bar with its importance value
for p in ax.patches:
x = p.get_width()
y = p.get_y()
text = '{:.2f}'.format(p.get_width())
ax.text(x, y,text,fontsize=15,color='indigo',va='top',ha='left')
show_method_attributes(model)
# model.plot_tree(4)
# WARNING: This crashes the google colab, do not use it (Nov 23, 2019)
# this takes long time maybe 1 or 2 hours, I did not waited,
# but did not crash until about 10 minutes.
# Xtr = Xtrain
# Xtx = Xtest
# ytr,ytx = ytrain, ytest
# pool = Pool(Xtr, ytr, cat_features=[], feature_names=features_raw_all)
# model = CatBoostClassifier(
# max_depth=2, verbose=False, max_ctr_complexity=1, iterations=2).fit(pool)
# model.plot_tree(
# tree_idx=0,
# pool=pool
# )
import catboost
from catboost import CatBoostClassifier

# part 1: fit a tiny classifier; its training metrics are written to the
# relative directory "crossentropy" (train_dir below).
cat_features = [0,1,2]
train_data = [["a", "b", 1, 4, 5, 6],
              ["a", "b", 4, 5, 6, 7],
              ["c", "d", 30, 40, 50, 60]]
train_labels = [1,1,0]
model = CatBoostClassifier(iterations=20,
                           loss_function = "CrossEntropy",
                           train_dir = "crossentropy")
model.fit(train_data, train_labels, cat_features)
predictions = model.predict(train_data)

# part 2: visualize
# BUG FIX: the visualizer must point at the same directory as train_dir.
# The old absolute path '/crossentropy/' names a non-existent root-level
# directory, so the widget waited forever for metric files to appear.
w = catboost.MetricVisualizer('crossentropy/')
w.start()
Part 1 works in Google Colab and writes some files into the directory "crossentropy", but part 2 keeps running for an infinite time.
from catboost import CatBoost, Pool
df_Xtrain.head(2)

# Train via the low-level CatBoost/Pool API on the raw numpy splits.
cat_features = [] # take it empty for the moment
dtrain = Pool(Xtrain, ytrain, cat_features=cat_features)
dvalid = Pool(Xvalid, yvalid, cat_features=cat_features)
dtest = Pool(Xtest, ytest, cat_features=cat_features)

params = {'iterations': 100, 'verbose': False, 'random_seed': 0}
bst_cat = CatBoost(params=params)
bst_cat.fit(dtrain);
# last RMSE on the validation pool
print(bst_cat.eval_metrics(dvalid, ['RMSE'])['RMSE'][-1])
# show_method_attributes(bst_cat)

ypreds = bst_cat.predict(dtest)
# r-squared values
r2 = sklearn.metrics.r2_score(ytest, ypreds)
# BUG FIX: use the shape of the data actually predicted on (Xtest) rather
# than the stale global Xtx left over from an earlier cell.
ar2 = adjustedR2(r2, Xtest.shape[0], Xtest.shape[1])
print('Adjusted R-squared value ', ar2)  # fixed typo: was "Adusted"
# bst_cat.plot_tree(tree_idx=0) # !!! Crashes Google Colab!!!
cv(pool=None, params=None, dtrain=None, iterations=None,
num_boost_round=None, fold_count=None, nfold=None, inverted=False,
partition_random_seed=0, seed=None, shuffle=True, logging_level=None,
stratified=None, as_pandas=True, metric_period=None, verbose=None,
verbose_eval=None, plot=False, early_stopping_rounds=None,
save_snapshot=None, snapshot_file=None,
snapshot_interval=None, folds=None, type='Classical')
# help(catboost.cv)
# 2-fold cross validation on the training pool via catboost's own cv().
params = {'iterations': 100, 'verbose': False, 'random_seed': 0}
# NOTE(review): plot expects a bool; the string "True" is merely truthy here.
df_scores = catboost.cv(dtrain,
params,
fold_count=2,
verbose=100,
plot="True") # plot does not work in google colab
print(df_scores.columns)
df_scores.head()
# train vs test RMSE curves over boosting iterations
fig, ax = plt.subplots(1,1,figsize=(12,8))
sns.lineplot(x='iterations',y='train-RMSE-mean',data=df_scores,ax=ax,color='r')
sns.lineplot(x='iterations',y='test-RMSE-mean',data=df_scores,ax=ax,color='b',alpha=0.2,linewidth=5,linestyle='--')
# SHAP explanation of a CatBoost model fit on the pandas train frame.
import shap
# fit the model
# NOTE(review): ytr is the label set assigned in an earlier cell (== ytrain);
# confirm cell execution order before rerunning out of sequence.
model = CatBoostRegressor(verbose=1000,random_state=0)
model.fit(df_Xtrain, ytr)
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(df_Xtest)
shap_values.shape, df_Xtest.shape, df_Xtrain.shape
df_Xtest.head(1)
df_Xtest.head(1)[['yr_built','sqft_living','lat','grade']]
# load JS visualization code to notebook
shap.initjs()
# Look only first row of test data
# use matplotlib=True to avoid Javascript
shap.force_plot(explainer.expected_value,
shap_values[0,:],
df_Xtest.iloc[0,:],
matplotlib=False,
text_rotation=90)
# the prediction for first row is 12.66 which is due to all columns.
#
# red features contribute positive, blue features contribute negative.
# here, first row has sqft_living = 7.799, which is a good value it makes prediction higher
# but, lat = 47.34 makes the label prediction lower
# load JS visualization code to notebook
shap.initjs()
# visualize the test set predictions
shap.force_plot(explainer.expected_value, shap_values, df_Xtest,matplotlib=False)
shap.summary_plot(shap_values, df_Xtest)
This plot is made of many dots. Each dot has three characteristics: its vertical position shows which feature it belongs to, its color shows whether that feature's value was high (red) or low (blue) for that row, and its horizontal position shows how much that feature pushed the prediction up or down.
For example, a point at the far right of the latitude row would be a house whose latitude substantially increased the predicted (log) price.
# Mean |SHAP| bar chart plus dependence plots for individual features.
shap.summary_plot(shap_values, df_Xtest, plot_type='bar')
shap.dependence_plot("sqft_living", shap_values, df_Xtest)
shap.dependence_plot("view", shap_values, df_Xtest)
# dependence of sqft_living colored by its interaction with sqft_living15
shap.dependence_plot(ind='sqft_living', interaction_index='sqft_living15',
shap_values=shap_values,
features=df_Xtest,
display_features=df_Xtest)
We should generally optimize model complexity first and then tune the convergence.
model complexity: max_depth etc
convergence: learning rate
Parameters:
# Baseline for hyperparameter tuning: default CatBoost on scaled arrays.
Xtr = scaler.transform(Xtrain)
Xtx = scaler.transform(Xtest)
ytr,ytx = ytrain, ytest
# fit the model
model_cat = CatBoostRegressor(verbose=1000,random_state=0)
model_cat.fit(Xtr, ytr)
# fitted model
model = model_cat
# ypreds
# NOTE(review): cross_val_predict refits clones on folds of the test data;
# model.predict(Xtx) would be the standard held-out evaluation.
kf=KFold(n_splits=5,shuffle=True,random_state=SEED)
ypreds = cross_val_predict(model, Xtx, ytx, cv=kf)
# r-squared values
r2 = sklearn.metrics.r2_score(ytx, ypreds)
ar2 = adjustedR2(r2, Xtx.shape[0], Xtx.shape[1])
print('Adjusted R-squared value for test', ar2)
# show_method_attributes(model)
# model.get_all_params()
"""
First check some iterations:
1000 = 0.9091745470144424 # default iterations = 1000
2000 = 0.9121352344981736
3000 = 0.9126262506236912
4000 = 0.9123204650792888
"""
time_start = time.time()
model = CatBoostRegressor(verbose=False,random_state=0,iterations=3_000)
model.fit(Xtr, ytr)
ypreds = model.predict(Xtx)
r2 = sklearn.metrics.r2_score(ytx, ypreds)
ar2 = adjustedR2(r2, Xtx.shape[0], Xtx.shape[1])
time_taken = time.time() - time_start
print('Time taken: {:.0f} min {:.0f} secs'.format(*divmod(time_taken,60)))
print('Adjusted R-squared value for test', ar2)
for n in [7]: # default detpth = 6
model = CatBoostRegressor(verbose=False,random_state=0,
iterations=3_000,
depth=n,
)
model.fit(Xtr, ytr)
ypreds = model.predict(Xtx)
r2 = sklearn.metrics.r2_score(ytx, ypreds)
ar2 = adjustedR2(r2, Xtx.shape[0], Xtx.shape[1])
print( round(n,6), round(ar2,6))
"""
2 0.900063
3 0.908837
4 0.911563
5 0.912822
6 0.912626
7 0.912898 **best
8 0.911368
9 0.909902
10 0.907586
""";
for n in [3]: # default l2_leaf_reg = 3
model = CatBoostRegressor(verbose=False,random_state=0,
iterations=3_000,
depth=7,
l2_leaf_reg=n,
)
model.fit(Xtr, ytr)
ypreds = model.predict(Xtx)
r2 = sklearn.metrics.r2_score(ytx, ypreds)
ar2 = adjustedR2(r2, Xtx.shape[0], Xtx.shape[1])
print( round(n,6), round(ar2,6))
"""
1 0.912553
2 0.912031
3 0.912898 ** best
4 0.912124
5 0.912581
6 0.912447
7 0.912124
8 0.912724
9 0.912262
""";
# Grid search over learning_rate and subsample on the tuned model.
model = CatBoostRegressor(verbose=False,random_state=0,
iterations=3_000,
depth=7,
)
# do not fit the model here
# let the grid search fit itself.
grid = {'learning_rate': [0.03, 0.1], # 0.02999999933
'subsample': [0.8, 0.7, 0.6,0.85,0.9,1], # default 0.8
}
dict_grid_search_result = model.grid_search(grid,
X=Xtr,
y=ytr,
plot=True)
dict_grid_search_result.keys()
dict_grid_search_result['params']
# cv_results holds per-iteration metric curves for the best parameter set
dict_cv_results = dict_grid_search_result['cv_results']
dict_cv_results.keys()
df_grid_search_cv = pd.DataFrame(dict_cv_results)
df_grid_search_cv.head()
sns.lineplot(x='iterations',y='test-RMSE-mean', data=df_grid_search_cv)
# Compare subsample=1 (grid-search suggestion) against the default 0.8.
model = CatBoostRegressor(verbose=False,random_state=0,
iterations=3_000,
depth=7,
l2_leaf_reg=3,
learning_rate= 0.03,
subsample= 1, # grid search subsamples gave me lower result
)
model.fit(Xtr, ytr)
ypreds = model.predict(Xtx)
r2 = sklearn.metrics.r2_score(ytx, ypreds)
ar2 = adjustedR2(r2, Xtx.shape[0], Xtx.shape[1])
print( round(ar2,6))
# same configuration with the default subsample
model = CatBoostRegressor(verbose=False,random_state=0,
iterations=3_000,
depth=7,
l2_leaf_reg=3,
learning_rate= 0.03,
subsample= 0.8, # default
)
model.fit(Xtr, ytr)
ypreds = model.predict(Xtx)
r2 = sklearn.metrics.r2_score(ytx, ypreds)
ar2 = adjustedR2(r2, Xtx.shape[0], Xtx.shape[1])
print( round(ar2,6))
df_Xtrain.head(2)
df_Xtrain.head(2)
params = dict(verbose=500,
random_state=0,
iterations=3_000,
depth=7,
l2_leaf_reg=3,
learning_rate= 0.03,
subsample= 0.8,
eval_metric='RMSE',
cat_features = ['bedrooms','view','condition',],
early_stopping_rounds=200,
)
model = catboost.CatBoostRegressor(**params)
model.fit(df_Xtrain, ytrain,
eval_set=(df_Xvalid, yvalid),
use_best_model=True,
plot=False
);
# show_method_attributes(model)
model.get_params()
params = model.get_params()
params['iterations'] = 2503
params['cat_features'] = []
model = catboost.CatBoostRegressor(**params)
model.fit(Xtr, ytr)
ypreds = model.predict(Xtx)
r2 = sklearn.metrics.r2_score(ytx, ypreds)
ar2 = adjustedR2(r2, Xtx.shape[0], Xtx.shape[1])
print( round(ar2,6))
df_Xtrain.head(2)
"""
all categories except float bathrooms and floors = 0.913583
"""
# Same tuned configuration, now on the pandas frames with all integer-like
# columns declared categorical.
"""
all categories except float bathrooms and floors = 0.913583
"""
Xtr = df_Xtrain
Xtx = df_Xtest
params ={'depth': 7,
'early_stopping_rounds': 200,
'eval_metric': 'RMSE',
'iterations': 3000,
'l2_leaf_reg': 3,
'learning_rate': 0.03,
'loss_function': 'RMSE',
'random_state': 0,
'subsample': 0.8,
'verbose': 500}
params['iterations'] = 2503
lst_cat_features = ['bedrooms','waterfront','view','condition','grade','zipcode']
params['cat_features'] = lst_cat_features
model = catboost.CatBoostRegressor(**params)
model.fit(Xtr, ytr)
ypreds = model.predict(Xtx)
r2 = sklearn.metrics.r2_score(ytx, ypreds)
ar2 = adjustedR2(r2, Xtx.shape[0], Xtx.shape[1])
print( round(ar2,6))
non_cat_features = df_Xtrain.columns.drop(lst_cat_features)
non_cat_features
# Experiment: log1p a wider set of size columns before fitting.
# NOTE(review): sqft_living, sqft_lot, sqft_living15 and sqft_lot15 were
# already log1p-transformed when df was built (log_cols cell) — this applies
# a second log1p to those four columns; confirm that is intended.
cols_log = [ 'sqft_living', 'sqft_lot', 'floors', 'sqft_above',
'sqft_basement',
'sqft_living15', 'sqft_lot15']
df_Xtrain_log = df_Xtrain.copy()
df_Xtest_log = df_Xtest.copy()
df_Xvalid_log = df_Xvalid.copy()
# sanity check: no missing values in any split
df_Xtrain.isnull().sum().sum(), df_Xtest.isnull().sum().sum(), df_Xvalid.isnull().sum().sum()
for c in cols_log:
df_Xtrain_log[c] = df_Xtrain_log[c].to_numpy().astype(float)
df_Xtest_log[c] = df_Xtest_log[c].to_numpy().astype(float)
df_Xvalid_log[c] = df_Xvalid_log[c].to_numpy().astype(float)
df_Xtrain_log[c] = np.log1p(df_Xtrain_log[c].to_numpy())
df_Xtest_log[c] = np.log1p(df_Xtest_log[c].to_numpy())
df_Xvalid_log[c] = np.log1p(df_Xvalid_log[c].to_numpy())
Xtr = df_Xtrain_log
Xtx = df_Xtest_log
params = {'depth': 7,
'early_stopping_rounds': 200,
'eval_metric': 'RMSE',
'iterations': 2503,
'l2_leaf_reg': 3,
'learning_rate': 0.03,
'loss_function': 'RMSE',
'random_state': 0,
'subsample': 0.8,
'verbose': 500
}
lst_cat_features = ['bedrooms','waterfront','view','condition','grade','zipcode']
params['cat_features'] = lst_cat_features
"""
all categories except float bathrooms and floors = 0.913582
log transform few features = 0.913582
"""
model = catboost.CatBoostRegressor(**params)
model.fit(Xtr, ytr)
ypreds = model.predict(Xtx)
r2 = sklearn.metrics.r2_score(ytx, ypreds)
ar2 = adjustedR2(r2, Xtx.shape[0], Xtx.shape[1])
print( round(ar2,6))
# Standardize the log-transformed columns.
# BUG FIX: the test split is now scaled with the *train* mean/std. The old
# code standardized test data with its own statistics, which leaks test-set
# information and makes train/test features inconsistent.
for c in cols_log:
    train_mean = df_Xtrain_log[c].mean()
    train_std = df_Xtrain_log[c].std()
    df_Xtrain_log[c] = (df_Xtrain_log[c] - train_mean) / train_std
    df_Xtest_log[c] = (df_Xtest_log[c] - train_mean) / train_std
"""
all categories except float bathrooms and floors = 0.913583
log transform few features = 0.90657
log transform few features + normalize = 0.913443
For tree based model, log+scaling did not help at all. Instead reduced acc.
"""
Xtr = df_Xtrain_log
Xtx = df_Xtest_log
params = {'depth': 7,
'early_stopping_rounds': 200,
'eval_metric': 'RMSE',
'iterations': 2503,
'l2_leaf_reg': 3,
'learning_rate': 0.03,
'loss_function': 'RMSE',
'random_state': 0,
'subsample': 0.8,
'verbose': 500
}
lst_cat_features = ['bedrooms','waterfront','view','condition','grade','zipcode']
params['cat_features'] = lst_cat_features
model = catboost.CatBoostRegressor(**params)
model.fit(Xtr, ytr)
ypreds = model.predict(Xtx)
r2 = sklearn.metrics.r2_score(ytx, ypreds)
ar2 = adjustedR2(r2, Xtx.shape[0], Xtx.shape[1])
print( round(ar2,6))
from sklearn.preprocessing import PolynomialFeatures

# Degree-2 interaction-only features for both splits.
polyfeat = PolynomialFeatures(degree=2, interaction_only=True)
Xtrain_poly = polyfeat.fit_transform(df_Xtrain)
# BUG FIX: only *transform* the test data with the transformer fitted on
# train; the old fit_transform re-fitted the transformer on the test split.
Xtest_poly = polyfeat.transform(df_Xtest)
Xtrain.shape, Xtest.shape, Xtrain_poly.shape, Xtest_poly.shape

# NOTE(review): newer scikit-learn renames this to get_feature_names_out()
lst_names = polyfeat.get_feature_names()
lst_names
pd.DataFrame(Xtrain_poly[:5,:])
df_Xtrain.head(2)
"""
deg 2 = 0.90771
deg 2, interaction only = 0.90655
"""
Xtr = Xtrain_poly
Xtx = Xtest_poly
params = {'depth': 7,
'early_stopping_rounds': 200,
'eval_metric': 'RMSE',
'l2_leaf_reg': 3,
'learning_rate': 0.03,
'loss_function': 'RMSE',
'random_state': 0,
'subsample': 0.8,
'verbose': 500,
'iterations': 2503
}
model = catboost.CatBoostRegressor(**params)
model.fit(Xtr, ytr)
ypreds = model.predict(Xtx)
r2 = sklearn.metrics.r2_score(ytx, ypreds)
ar2 = adjustedR2(r2, Xtx.shape[0], Xtx.shape[1])
print( round(ar2,6))
# Sensitivity of the tuned model to the random seed; each seed refits from
# scratch on the categorical pandas frames and reports adjusted R-squared.
for n in [0, 42, 100, 314, 31416, 123, 12345, 111]:
Xtr = df_Xtrain
Xtx = df_Xtest
params ={'depth': 7,
'early_stopping_rounds': 200,
'eval_metric': 'RMSE',
'iterations':2503,
'l2_leaf_reg': 3,
'learning_rate': 0.03,
'loss_function': 'RMSE',
'random_state': n,
'subsample': 0.8,
'verbose': False}
lst_cat_features = ['bedrooms','waterfront','view','condition','grade','zipcode']
params['cat_features'] = lst_cat_features
model = catboost.CatBoostRegressor(**params)
model.fit(Xtr, ytr)
ypreds = model.predict(Xtx)
r2 = sklearn.metrics.r2_score(ytx, ypreds)
ar2 = adjustedR2(r2, Xtx.shape[0], Xtx.shape[1])
print( n, round(ar2,6))
# Refit the best configuration with the best seed found in the sweep (123),
# then inspect feature importances with eli5 and catboost.
Xtr = df_Xtrain
Xtx = df_Xtest
params ={'depth': 7,
         'early_stopping_rounds': 200,
         'eval_metric': 'RMSE',
         'iterations':2503,
         'l2_leaf_reg': 3,
         'learning_rate': 0.03,
         'loss_function': 'RMSE',
         'random_state': 123,
         'subsample': 0.8,
         'verbose': False}
lst_cat_features = ['bedrooms','waterfront','view','condition','grade','zipcode']
params['cat_features'] = lst_cat_features
model = catboost.CatBoostRegressor(**params)
model.fit(Xtr, ytr)
ypreds = model.predict(Xtx)
r2 = sklearn.metrics.r2_score(ytx, ypreds)
ar2 = adjustedR2(r2, Xtx.shape[0], Xtx.shape[1])
# BUG FIX: the old print reused the leftover loop variable ``n`` from the
# seed-sweep cell; only the score belongs in this cell's output.
print(round(ar2,6))

plot_feature_imp_catboost(model, features_raw_all)

# weight-based explanation of the fitted model
import eli5
eli5.show_weights(model)

df_fimp = model.get_feature_importance(prettified=True)[['Importances','Feature Id']]
df_fimp.style.background_gradient(subset=['Importances'])